import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import zscore
df=pd.read_csv('fl.csv')
df
| DAY_OF_MONTH | DAY_OF_WEEK | OP_UNIQUE_CARRIER | OP_CARRIER_AIRLINE_ID | OP_CARRIER | TAIL_NUM | OP_CARRIER_FL_NUM | ORIGIN_AIRPORT_ID | ORIGIN_AIRPORT_SEQ_ID | ORIGIN | ... | DEST | DEP_TIME | DEP_DEL15 | DEP_TIME_BLK | ARR_TIME | ARR_DEL15 | CANCELLED | DIVERTED | DISTANCE | Unnamed: 21 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3 | EV | 20366 | EV | N48901 | 4397 | 13930 | 1393007 | ORD | ... | GRB | 1003.0 | 0.0 | 1000-1059 | 1117.0 | 0.0 | 0.0 | 0.0 | 174.0 | NaN |
| 1 | 1 | 3 | EV | 20366 | EV | N16976 | 4401 | 15370 | 1537002 | TUL | ... | ORD | 1027.0 | 0.0 | 1000-1059 | 1216.0 | 0.0 | 0.0 | 0.0 | 585.0 | NaN |
| 2 | 1 | 3 | EV | 20366 | EV | N12167 | 4404 | 11618 | 1161802 | EWR | ... | TYS | 1848.0 | 0.0 | 1800-1859 | 2120.0 | 0.0 | 0.0 | 0.0 | 631.0 | NaN |
| 3 | 1 | 3 | EV | 20366 | EV | N14902 | 4405 | 10781 | 1078105 | BTR | ... | IAH | 1846.0 | 0.0 | 1800-1859 | 2004.0 | 0.0 | 0.0 | 0.0 | 253.0 | NaN |
| 4 | 1 | 3 | EV | 20366 | EV | N606UX | 4407 | 14524 | 1452401 | RIC | ... | IAH | 1038.0 | 0.0 | 1000-1059 | 1330.0 | 0.0 | 0.0 | 0.0 | 1157.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 607341 | 31 | 5 | 9E | 20363 | 9E | N331CA | 4812 | 15412 | 1541205 | TYS | ... | DTW | 1002.0 | 1.0 | 0700-0759 | 1128.0 | 1.0 | 0.0 | 0.0 | 443.0 | NaN |
| 607342 | 31 | 5 | 9E | 20363 | 9E | N295PQ | 4813 | 11433 | 1143302 | DTW | ... | JFK | 1747.0 | 0.0 | 1700-1759 | 1933.0 | 0.0 | 0.0 | 0.0 | 509.0 | NaN |
| 607343 | 31 | 5 | 9E | 20363 | 9E | N294PQ | 4814 | 11996 | 1199603 | GSP | ... | LGA | 554.0 | 0.0 | 0600-0659 | 752.0 | 0.0 | 0.0 | 0.0 | 610.0 | NaN |
| 607344 | 31 | 5 | 9E | 20363 | 9E | N228PQ | 4815 | 10397 | 1039707 | ATL | ... | XNA | 1714.0 | 0.0 | 1700-1759 | 1811.0 | 0.0 | 0.0 | 0.0 | 589.0 | NaN |
| 607345 | 31 | 5 | 9E | 20363 | 9E | N228PQ | 4815 | 15919 | 1591904 | XNA | ... | ATL | 1841.0 | 0.0 | 1800-1859 | 2128.0 | 0.0 | 0.0 | 0.0 | 589.0 | NaN |
607346 rows × 22 columns
df.head()
| DAY_OF_MONTH | DAY_OF_WEEK | OP_UNIQUE_CARRIER | OP_CARRIER_AIRLINE_ID | OP_CARRIER | TAIL_NUM | OP_CARRIER_FL_NUM | ORIGIN_AIRPORT_ID | ORIGIN_AIRPORT_SEQ_ID | ORIGIN | ... | DEST | DEP_TIME | DEP_DEL15 | DEP_TIME_BLK | ARR_TIME | ARR_DEL15 | CANCELLED | DIVERTED | DISTANCE | Unnamed: 21 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3 | EV | 20366 | EV | N48901 | 4397 | 13930 | 1393007 | ORD | ... | GRB | 1003.0 | 0.0 | 1000-1059 | 1117.0 | 0.0 | 0.0 | 0.0 | 174.0 | NaN |
| 1 | 1 | 3 | EV | 20366 | EV | N16976 | 4401 | 15370 | 1537002 | TUL | ... | ORD | 1027.0 | 0.0 | 1000-1059 | 1216.0 | 0.0 | 0.0 | 0.0 | 585.0 | NaN |
| 2 | 1 | 3 | EV | 20366 | EV | N12167 | 4404 | 11618 | 1161802 | EWR | ... | TYS | 1848.0 | 0.0 | 1800-1859 | 2120.0 | 0.0 | 0.0 | 0.0 | 631.0 | NaN |
| 3 | 1 | 3 | EV | 20366 | EV | N14902 | 4405 | 10781 | 1078105 | BTR | ... | IAH | 1846.0 | 0.0 | 1800-1859 | 2004.0 | 0.0 | 0.0 | 0.0 | 253.0 | NaN |
| 4 | 1 | 3 | EV | 20366 | EV | N606UX | 4407 | 14524 | 1452401 | RIC | ... | IAH | 1038.0 | 0.0 | 1000-1059 | 1330.0 | 0.0 | 0.0 | 0.0 | 1157.0 | NaN |
5 rows × 22 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 607346 entries, 0 to 607345 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DAY_OF_MONTH 607346 non-null int64 1 DAY_OF_WEEK 607346 non-null int64 2 OP_UNIQUE_CARRIER 607346 non-null object 3 OP_CARRIER_AIRLINE_ID 607346 non-null int64 4 OP_CARRIER 607346 non-null object 5 TAIL_NUM 606648 non-null object 6 OP_CARRIER_FL_NUM 607346 non-null int64 7 ORIGIN_AIRPORT_ID 607346 non-null int64 8 ORIGIN_AIRPORT_SEQ_ID 607346 non-null int64 9 ORIGIN 607346 non-null object 10 DEST_AIRPORT_ID 607346 non-null int64 11 DEST_AIRPORT_SEQ_ID 607346 non-null int64 12 DEST 607346 non-null object 13 DEP_TIME 600682 non-null float64 14 DEP_DEL15 600647 non-null float64 15 DEP_TIME_BLK 607346 non-null object 16 ARR_TIME 600271 non-null float64 17 ARR_DEL15 599268 non-null float64 18 CANCELLED 607346 non-null float64 19 DIVERTED 607346 non-null float64 20 DISTANCE 607346 non-null float64 21 Unnamed: 21 0 non-null float64 dtypes: float64(8), int64(8), object(6) memory usage: 101.9+ MB
df.isnull().sum()/len(df)*100
DAY_OF_MONTH 0.000000 DAY_OF_WEEK 0.000000 OP_UNIQUE_CARRIER 0.000000 OP_CARRIER_AIRLINE_ID 0.000000 OP_CARRIER 0.000000 TAIL_NUM 0.114926 OP_CARRIER_FL_NUM 0.000000 ORIGIN_AIRPORT_ID 0.000000 ORIGIN_AIRPORT_SEQ_ID 0.000000 ORIGIN 0.000000 DEST_AIRPORT_ID 0.000000 DEST_AIRPORT_SEQ_ID 0.000000 DEST 0.000000 DEP_TIME 1.097233 DEP_DEL15 1.102996 DEP_TIME_BLK 0.000000 ARR_TIME 1.164904 ARR_DEL15 1.330049 CANCELLED 0.000000 DIVERTED 0.000000 DISTANCE 0.000000 Unnamed: 21 100.000000 dtype: float64
df.drop('Unnamed: 21',axis=1,inplace=True)
df.head()
| DAY_OF_MONTH | DAY_OF_WEEK | OP_UNIQUE_CARRIER | OP_CARRIER_AIRLINE_ID | OP_CARRIER | TAIL_NUM | OP_CARRIER_FL_NUM | ORIGIN_AIRPORT_ID | ORIGIN_AIRPORT_SEQ_ID | ORIGIN | ... | DEST_AIRPORT_SEQ_ID | DEST | DEP_TIME | DEP_DEL15 | DEP_TIME_BLK | ARR_TIME | ARR_DEL15 | CANCELLED | DIVERTED | DISTANCE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3 | EV | 20366 | EV | N48901 | 4397 | 13930 | 1393007 | ORD | ... | 1197705 | GRB | 1003.0 | 0.0 | 1000-1059 | 1117.0 | 0.0 | 0.0 | 0.0 | 174.0 |
| 1 | 1 | 3 | EV | 20366 | EV | N16976 | 4401 | 15370 | 1537002 | TUL | ... | 1393007 | ORD | 1027.0 | 0.0 | 1000-1059 | 1216.0 | 0.0 | 0.0 | 0.0 | 585.0 |
| 2 | 1 | 3 | EV | 20366 | EV | N12167 | 4404 | 11618 | 1161802 | EWR | ... | 1541205 | TYS | 1848.0 | 0.0 | 1800-1859 | 2120.0 | 0.0 | 0.0 | 0.0 | 631.0 |
| 3 | 1 | 3 | EV | 20366 | EV | N14902 | 4405 | 10781 | 1078105 | BTR | ... | 1226603 | IAH | 1846.0 | 0.0 | 1800-1859 | 2004.0 | 0.0 | 0.0 | 0.0 | 253.0 |
| 4 | 1 | 3 | EV | 20366 | EV | N606UX | 4407 | 14524 | 1452401 | RIC | ... | 1226603 | IAH | 1038.0 | 0.0 | 1000-1059 | 1330.0 | 0.0 | 0.0 | 0.0 | 1157.0 |
5 rows × 21 columns
plt.figure(figsize=(14,6))
corr=df.corr(method='pearson')
heatmap=sns.heatmap(corr,annot=True,vmax=1,vmin=-1,linewidths=1,linecolor='Black')
plt.show()
num_col=df.select_dtypes(include=("int",'float')).columns
num_col
Index(['DAY_OF_WEEK', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'DEP_TIME',
'DEP_DEL15', 'ARR_TIME', 'ARR_DEL15', 'DISTANCE'],
dtype='object')
df.tail()
| DAY_OF_WEEK | ORIGIN_AIRPORT_ID | DEST_AIRPORT_ID | DEP_TIME | DEP_DEL15 | DEP_TIME_BLK | ARR_TIME | ARR_DEL15 | DISTANCE | |
|---|---|---|---|---|---|---|---|---|---|
| 607341 | 5 | 15412 | 11433 | 1002.0 | 1.0 | 0700-0759 | 1128.0 | 1.0 | 443.0 |
| 607342 | 5 | 11433 | 12478 | 1747.0 | 0.0 | 1700-1759 | 1933.0 | 0.0 | 509.0 |
| 607343 | 5 | 11996 | 12953 | 554.0 | 0.0 | 0600-0659 | 752.0 | 0.0 | 610.0 |
| 607344 | 5 | 10397 | 15919 | 1714.0 | 0.0 | 1700-1759 | 1811.0 | 0.0 | 589.0 |
| 607345 | 5 | 15919 | 10397 | 1841.0 | 0.0 | 1800-1859 | 2128.0 | 0.0 | 589.0 |
from plotly.subplots import make_subplots
import plotly.graph_objects as go
values = [df['DEP_DEL15'].value_counts()[0],df['DEP_DEL15'].value_counts()[1]]
labels = ["Delayed (<15 minutes)", "Delayed (>15 minutes)"]
colors = ['blue','red']
values_arr = [df['ARR_DEL15'].value_counts()[0],df['ARR_DEL15'].value_counts()[1]]
fig = make_subplots(1, 2, specs=[[{'type':'domain'}, {'type':'domain'}]], subplot_titles=['Departures', 'Arrivals'])
fig.add_trace(go.Pie(labels=labels, values=values, pull=[0, 0.1],textinfo = 'label+percent'),1,1)
fig.add_trace(go.Pie(labels=labels, values=values_arr, pull=[0, 0.1],textinfo = 'label+percent'),1,2)
fig.update_traces( textinfo='value', textfont_size=14, marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(uniformtext_minsize=4, uniformtext_mode='hide', title_text='Flights Delayed', title_x = 0.5, legend_title='<b>Flights</b>',legend=dict(x=.45, y=0.6))
# Set the width and height of the figure
plt.figure(figsize=(12,6))
# Add title
plt.title("Flight delay on week days")
# Bar chart showing average arrival delay for Spirit Airlines flights by month
sns.barplot(x=df.DAY_OF_WEEK, y=df['ARR_DEL15'])
# Add label for vertical axis
plt.ylabel("Arrival delay (in minutes)")
Text(0, 0.5, 'Arrival delay (in minutes)')
#drop flights with missing data
df = df[(df.CANCELLED != 1) & (df.DEP_TIME.isna() == False) & (df.ARR_TIME.isna() == False)]
len(df.CANCELLED.isna()), len(df.DEP_TIME.isna()), len(df.ARR_TIME.isna()), len(df)
#drop when target is NAN
df = df[(df.ARR_DEL15.isna() == False)]
df.isna().sum()
df.drop('DEP_TIME',axis=1,inplace=True)
df.drop('DEP_DEL15',axis=1,inplace=True)
df.loc[df['DEP_TIME_BLK']=='0001-0559', 'DEP_TIME_BLK'] = '0500-0559'
print(df.loc[[211]])
DAY_OF_WEEK ORIGIN_AIRPORT_ID DEST_AIRPORT_ID DEP_TIME DEP_DEL15 \
211 3 11140 12266 1207.0 0.0
DEP_TIME_BLK ARR_TIME ARR_DEL15 DISTANCE
211 1200-1259 1259.0 0.0 201.0
cols=['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER', 'DIVERTED', 'CANCELLED',
'ORIGIN', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_SEQ_ID',
'DEST_AIRPORT_SEQ_ID', 'DEST', 'DAY_OF_MONTH', 'OP_UNIQUE_CARRIER']
df=df.drop(cols,axis=1)
df.isna().sum()
def get_day_category(day_of_week):
if day_of_week <= 5:
return 0 #'for working day'
elif day_of_week > 5:
return 1 #'for weekend'
df.DAY_OF_WEEK = df.DAY_OF_WEEK.apply(get_day_category)
df.rename(columns={'DAY_OF_WEEK':'type_of_day'})
| type_of_day | ORIGIN_AIRPORT_ID | DEST_AIRPORT_ID | DEP_TIME | DEP_DEL15 | DEP_TIME_BLK | ARR_TIME | ARR_DEL15 | DISTANCE | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 13930 | 11977 | 1003.0 | 0.0 | 1000-1059 | 1117.0 | 0.0 | 174.0 |
| 1 | 0 | 15370 | 13930 | 1027.0 | 0.0 | 1000-1059 | 1216.0 | 0.0 | 585.0 |
| 2 | 0 | 11618 | 15412 | 1848.0 | 0.0 | 1800-1859 | 2120.0 | 0.0 | 631.0 |
| 3 | 0 | 10781 | 12266 | 1846.0 | 0.0 | 1800-1859 | 2004.0 | 0.0 | 253.0 |
| 4 | 0 | 14524 | 12266 | 1038.0 | 0.0 | 1000-1059 | 1330.0 | 0.0 | 1157.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 607341 | 0 | 15412 | 11433 | 1002.0 | 1.0 | 0700-0759 | 1128.0 | 1.0 | 443.0 |
| 607342 | 0 | 11433 | 12478 | 1747.0 | 0.0 | 1700-1759 | 1933.0 | 0.0 | 509.0 |
| 607343 | 0 | 11996 | 12953 | 554.0 | 0.0 | 0600-0659 | 752.0 | 0.0 | 610.0 |
| 607344 | 0 | 10397 | 15919 | 1714.0 | 0.0 | 1700-1759 | 1811.0 | 0.0 | 589.0 |
| 607345 | 0 | 15919 | 10397 | 1841.0 | 0.0 | 1800-1859 | 2128.0 | 0.0 | 589.0 |
599268 rows × 9 columns
df['origin_day_avg'] = df.groupby(['ORIGIN_AIRPORT_ID','DAY_OF_WEEK'])['DEP_TIME'].transform('mean').astype(np.float16)
df['dest_day_avg'] = df.groupby(['DEST_AIRPORT_ID','DAY_OF_WEEK'])['ARR_TIME'].transform('mean').astype(np.float16)
#replace missing target values
def new_arrival_delay(dep_delay, arr_delay):
if np.isnan(arr_delay):
return dep_delay
else:
return arr_delay
df['ARR_DEL15'] = df.apply(lambda row :new_arrival_delay(row['DEP_DEL15'],row['ARR_DEL15']), axis = 1)
# CREATE ARR_TIME_BLK
#generate block hours
blocks = []
for hour in range(0,24):
hour_part = ('%02d' %(hour))
blocks.append(hour_part + '00-' + hour_part + '59')
blocks
def arrival_time_blk(arr_time):
arr_hour = str('%04d' %(arr_time))[:2]
arr_block = None
for block in blocks:
#print (block,arr_hour)
if block.startswith(arr_hour):
arr_block = block
break
if arr_block == None and str(arr_time) == '2400.0':
arr_block = '0000-0059'
#print('Cannot find block for #' + str(arr_time) + '#: set block to #' + arr_block + '#')
return arr_block
df['ARR_TIME_BLK'] = df.ARR_TIME.apply(arrival_time_blk)
# drop the no more useful ARR_TIME
df.drop(['ARR_TIME'], inplace=True, axis=1)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(blocks)
le.classes_
df['ARR_TIME_BLK'] = le.transform(df.ARR_TIME_BLK.values)
df['DEP_TIME_BLK'] = le.transform(df.DEP_TIME_BLK.values)
df=df.reindex(sorted(df.columns), axis=1)
df=df.drop(['DEP_TIME','DEP_DEL15'], axis=1,inplace=True)
from sklearn.model_selection import train_test_split
y = df['ARR_DEL15'].values
X = df.drop(['ARR_DEL15'], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=123)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)
(479414, 8) (479414,) (119854, 8) (119854,)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=13)
model.fit(X_train, y_train)
RandomForestClassifier(random_state=13)
RandomForestClassifier(random_state=13)
RandomForestClassifier(random_state=13)
predicted = model.predict(X_test)
model.score(X_test, y_test)
0.9125686251606121
#testing the model
from sklearn.metrics import classification_report,confusion_matrix,f1_score
model_predictions = model.predict(X_test)
print("confusion matrix is:\n",confusion_matrix(y_test, model_predictions))
print("\nClassification report is:\n",classification_report(y_test, model_predictions))
confusion matrix is:
[[101118 2222]
[ 7742 8772]]
Classification report is:
precision recall f1-score support
0.0 0.93 0.98 0.95 103340
1.0 0.80 0.53 0.64 16514
accuracy 0.92 119854
macro avg 0.86 0.75 0.80 119854
weighted avg 0.91 0.92 0.91 119854
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
DecisionTreeClassifier()
predicted = model.predict(X_test)
model.score(X_test, y_test)
0.916865519715654
#testing the model
from sklearn.metrics import classification_report,confusion_matrix,f1_score
model_predictions = model.predict(X_test)
print("confusion matrix is:\n",confusion_matrix(y_test, model_predictions))
print("\nClassification report is:\n",classification_report(y_test, model_predictions))
confusion matrix is:
[[101118 2222]
[ 7742 8772]]
Classification report is:
precision recall f1-score support
0.0 0.93 0.98 0.95 103340
1.0 0.80 0.53 0.64 16514
accuracy 0.92 119854
macro avg 0.86 0.75 0.80 119854
weighted avg 0.91 0.92 0.91 119854
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)
KNeighborsClassifier()
predicted = model.predict(X_test)
model.score(X_test, y_test)
0.8730789126770905
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
predicted = model.predict(X_test)
model.score(X_test, y_test)
0.8622156957631786